library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.2.0 ✔ purrr 0.3.2
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 0.8.3 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ dplyr::slice() masks xgboost::slice()
## ✖ lubridate::union() masks base::union()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(segmented)
# Load project helpers (presumably defines rmse, fit_xgboost, fit_svm used
# below — confirm against functions.r) and the pre-built feature tables.
source('functions.r')
load("Table_construction.Rdata")
# Fix RNG seed so model fits and tie-breaking are reproducible.
set.seed(12)
## Join auxiliary person-level information onto the feature table, derive the
## modeling features, and define the row filters used throughout the analysis.
features = features %>%
  # Add other useful information (race/sex/name from the nested people table):
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  # select(-c(offenses_within_30.y)) %>%
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Recidivism_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Recidivism_decile_score`, # Adjust for violent/general
    # Cap the count features at 5.
    p_jail30 = pmin(p_jail30,5),
    # BUG FIX: this originally capped p_jail30 a second time
    # (p_prison30 = pmin(p_jail30,5)); cap p_prison30 itself.
    p_prison30 = pmin(p_prison30,5),
    p_prison = pmin(p_prison,5),
    p_probation = pmin(p_probation,5),
    # One-hot race indicators; race == "Other" is the baseline.
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0),
    # Subscales: evenly-weighted criminal-involvement sum.
    crim_inv = p_arrest+
      p_jail30+
      p_prison+
      p_probation,
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two decile-score conditions were separated by a comma,
    # which left the second as a stray unnamed mutate() argument; combine
    # them with & so filt1 requires BOTH scores to be valid.
    filt1 = `Risk of Recidivism_decile_score` != -1 &
      `Risk of Violence_decile_score` != -1,
    filt2 = offenses_within_30 == 1,
    filt3 = !is.na(current_offense_date),
    filt4 = current_offense_date <= current_offense_date_limit,
    filt5 = p_current_age > 18 & p_current_age <= 65,
    filt6 = crim_inv == 0
  )
Modelling the COMPAS Risk of Recidivism score with a quadratic polynomial.
#filter out any individuals with crim inv history
# Model the lower bound of the raw general score as a quadratic in age.
# Restrict to valid scores (filt1), age in (18, 65] (filt5), and zero
# criminal involvement (filt6) so age is the only driver of the score.
features_age_poly = features %>%
filter(filt1,filt5, filt6)
# Lower-bound points: the single smallest raw score observed at each age
# (ties in top_n keep multiple rows).
lb_age = features_age_poly %>%
group_by(p_current_age) %>%
arrange(raw_score) %>%
top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value for each age
# Quadratic fit: raw_score ~ age^2 + age on the lower-bound points.
mdl_age = lm(raw_score ~
I(p_current_age^2) +
p_current_age,
data=lb_age)
# More precision for paper
summary(mdl_age)
##
## Call:
## lm(formula = raw_score ~ I(p_current_age^2) + p_current_age,
## data = lb_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.213689 -0.016285 0.006031 0.027235 0.080507
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.816e-02 5.402e-02 -1.817 0.0721 .
## I(p_current_age^2) 4.291e-04 3.229e-05 13.287 <2e-16 ***
## p_current_age -7.178e-02 2.728e-03 -26.313 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04547 on 103 degrees of freedom
## Multiple R-squared: 0.9894, Adjusted R-squared: 0.9892
## F-statistic: 4789 on 2 and 103 DF, p-value: < 2.2e-16
# Report coefficients at full precision for the paper.
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "-9.81565971191264569073e-02" "4.29064498817958452602e-04"
## [3] "-7.17796337351976898589e-02"
## Add f(age) to features
# f_age = fitted age polynomial; raw_score__age_poly = score remainder after
# subtracting it; filt7 keeps points on/above the polynomial (0.05 tolerance).
features = features %>%
mutate(
f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_poly = raw_score - f_age,
filt7 = raw_score >= f_age - 0.05
)
## Add same filters to lb_age
lb_age = lb_age %>%
mutate(
f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_poly = raw_score - f_age,
filt7 = raw_score >= f_age - 0.05
)
Plotting settings
# Plotting range for age (matches the filt5 bounds) plus a dense grid of
# 1000 evaluation points for drawing fitted curves.
xmin <- 18
xmax <- 65
xx <- seq(xmin, xmax, length.out = 1000)
Generate a preliminary plot of age vs COMPAS general score
# Preliminary plot: lower-bound points colored by filt7 (above/below the
# fitted age polynomial), with the fitted curve overlaid.
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color = factor(filt7)),alpha=.3, data=lb_age) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("General score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# Same curve over ALL rows that satisfied filt1/filt5/filt6 (not just the
# per-age minima).
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_age_poly) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("General score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# Data for the age spline: additionally drop points below the age polynomial
# (filt7).
features_age_spline = features %>%
filter(filt1, filt5, filt6, filt7)
# Per-age minimum score among the retained points.
lb_filt = features_age_spline %>%
group_by(p_current_age) %>%
arrange(raw_score)%>%
top_n(n=-1, wt = raw_score)
#filtered out points
lb_outliers = rbind(
#reason not included in lb_filt was
#age not above age poly
setdiff(lb_age, lb_filt) %>%
filter(!filt7) %>% #below age poly
mutate(reason = "Below age polynomial")
)
# Combined plotting table: spline-construction points plus the outliers.
# NOTE(review): lb_filt is reduced to 4 columns before rbind while
# lb_outliers keeps all columns — confirm the column sets actually align.
lb = lb_filt %>%
select(p_current_age, p_age_first_offense, raw_score) %>%
mutate(reason = "Used to construct linear splines") %>%
rbind(lb_outliers)
Generating linear splines to fit the lower bound. Plotting individuals on the new bottom edge produced by fitting to the lb_filt individuals.
# Piecewise-linear (segmented) fit of the lower bound vs age, with two
# breakpoints initialized at ages 39 and 58 (segmented re-estimates them).
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt),
seg.Z = ~p_current_age, psi = list(p_current_age = c(39,58)),
control = seg.control(display = FALSE)
)
#Add Filter 8
# filt8 keeps points on/above the fitted age spline (0.05 tolerance);
# raw_score__age_spline is the remainder used by all downstream models.
features = features %>%
mutate(
age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_spline = raw_score - age_spline,
filt8 = raw_score >= age_spline - 0.05
)
# Report the per-segment intercepts, slopes, and estimated breakpoints.
intercept(mdl_age_spline)
## $p_current_age
## Est.
## intercept1 -0.28215
## intercept2 -0.98923
## intercept3 -1.45920
slope(mdl_age_spline)
## $p_current_age
## Est. St.Err. t value CI(95%).l CI(95%).u
## slope1 -0.052747 0.00083055 -63.509 -0.054394 -0.051100
## slope2 -0.032251 0.00081987 -39.337 -0.033876 -0.030625
## slope3 -0.022852 0.00090704 -25.194 -0.024650 -0.021054
summary.segmented(mdl_age_spline)$psi
## Initial Est. St.Err
## psi1.p_current_age 39 34.49774 0.4479371
## psi2.p_current_age 58 49.99995 1.1230493
Note: the reason why the number of points below the age polynomial appear different between the two plots is that the first plot (displaying the points used to construct the linear spline) show only data points where criminal involvement = 0, whereas the second plot does not have this constraint. In other words, the first plot has filt6 applied whereas the second plot does not.
# Estimated breakpoints from the segmented fit; split the x grid into one
# range per linear segment so each can be drawn in its own color.
break1 = summary.segmented(mdl_age_spline)$psi[1,2]
break2 = summary.segmented(mdl_age_spline)$psi[2,2]
xx1 = seq(xmin,break1, length.out=1000)
xx2 = seq(break1,break2, length.out=1000)
xx3 = seq(break2,xmax, length.out=1000)
# Paper figure: lower-bound points (colored by reason) with the three
# spline segments overlaid.
age_spline_recid =
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
scale_color_manual(values=c("red", "grey25")) +
geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
color="darkorange1", size = 1) +
geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
color="cyan3", size = 1) +
geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
color="seagreen3", size = 1) +
theme_bw()+
xlim(xmin,xmax)+
xlab("\n Age at COMPAS screening date") +
ylab("General Score \n") +
theme(text = element_text(size=9),
axis.text=element_text(size=7),
legend.position = c(.95,.95),
legend.title = element_blank(),
legend.justification = c("right", "top"),
legend.key = element_rect(fill = "aliceblue"),
legend.background = element_rect(fill="aliceblue",
size=0.5, linetype="solid")
)
age_spline_recid
# Age spline with all data (filters 1,5 still applied for data quality)
# Red points are below the age polynomial not age spline
age_spline_recid_all =
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#F8766D", data=features %>% filter(filt1,filt5,!filt7)) + # Age outliers
geom_point(aes(x=p_current_age, raw_score), color="#619CFF", alpha=.3, data=features %>% filter(filt1,filt5,filt7)) + # Not age outliers
geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
color="#F8766D", size = 1) +
geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
color="#F8766D", size = 1) +
geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
color="#F8766D", size = 1) +
theme_bw()+
xlim(xmin,xmax)+
xlab("\n Age at COMPAS screening date") +
ylab("General score \n") +
theme(text = element_text(size=9),
axis.text=element_text(size=7),
legend.position="top")
age_spline_recid_all
# Save both age-spline figures at paper size.
ggsave("Figures/age_LB_general.pdf", plot = age_spline_recid, width = 3.5, height = 2.5, units = "in")
ggsave("Figures/age_LB_alldata_general.pdf", plot = age_spline_recid_all, width = 3.5, height = 2.5, units = "in")
# Score remainder (after the age spline) vs number of prior charges,
# colored by whether the point is above the age spline (filt8).
prior_charges_LB_general = features %>%
filter(filt1,filt3) %>%
select(p_charge, raw_score__age_spline, filt8) %>%
ggplot() +
geom_point(aes(x=p_charge,y=raw_score__age_spline,color=filt8),alpha=.5) +
xlim(c(0,45))+
theme_bw()+
xlab("Number of prior charges") +
ylab(expression(General~score~-~f[age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[age]),
expression(Below~f[age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
prior_charges_LB_general
## Warning: Removed 69 rows containing missing values (geom_point).
# Same plot as above but against number of prior arrests.
prior_arrests_LB_general = features %>%
filter(filt1,filt3) %>%
select(p_arrest, raw_score__age_spline, filt8) %>%
ggplot() +
geom_point(aes(x=p_arrest,y=raw_score__age_spline,color=filt8),alpha=.5) +
xlim(c(0,45))+
theme_bw()+
xlab("Number of prior arrests") +
ylab(expression(General~score~-~f[age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[age]),
expression(Below~f[age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
prior_arrests_LB_general
## Warning: Removed 2 rows containing missing values (geom_point).
# Save both figures at paper size.
ggsave("Figures/prior_arrests_LB_general.pdf", plot = prior_arrests_LB_general, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 2 rows containing missing values (geom_point).
ggsave("Figures/prior_charges_LB_general.pdf", plot=prior_charges_LB_general, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 69 rows containing missing values (geom_point).
# Score remainder vs the evenly-weighted criminal-involvement subscale sum.
features %>%
filter(filt1,filt3, filt5) %>%
select(crim_inv, raw_score__age_spline, filt8) %>%
ggplot() +
geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
xlim(c(0,45))+
theme_bw()+
xlab("Sum of Criminal Involvement Components") +
ylab(expression(General~score~-~f[age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[age]), expression(Below~f[age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 9 rows containing missing values (geom_point).
# ggsave with no plot argument saves the last plot displayed.
ggsave("Figures/crim_inv_LB_general_arrests.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 9 rows containing missing values (geom_point).
Try various weightings of inputs criminal involvement subscale (i.e., instead of evenly weighted) (optional)
## Explore alternative weightings of the criminal-involvement components
## (instead of the evenly-weighted sum): for every combination of integer
## weights 1..3, re-plot the score remainder against the re-weighted sum
## and save the figure.
dir.create("./Figures/crim_inv_weightings_general/")
crim_inv_weights_perms = expand.grid(arrest=seq(1,3,1),
                                     jail30=seq(1,3,1),
                                     prison=seq(1,3,1),
                                     probation=seq(1,3,1))
# seq_len(nrow(...)) instead of 1:nrow(...): the latter iterates over
# c(1, 0) when the grid is empty.
for (i in seq_len(nrow(crim_inv_weights_perms))){
  weights = unlist(crim_inv_weights_perms[i,])
  # Encode the weight combination in the title / file name.
  title = paste0('arrest=',weights['arrest'],
                 '_jail30=',weights['jail30'],
                 '_prison=',weights['prison'],
                 '_probation=',weights['probation'])
  p <- features %>%
    mutate(crim_inv =
             weights['arrest']*p_arrest+
             weights['jail30']*p_jail30+
             weights['prison']*p_prison+
             weights['probation']*p_probation) %>%
    filter(filt1,filt3, filt5) %>%
    select(crim_inv, raw_score__age_spline, filt8) %>%
    ggplot() +
    geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
    xlim(c(0,45))+
    theme_bw()+
    xlab("Sum of Criminal Involvement Components") +
    ylab(expression(General~score~-~f[age])) +
    theme(text = element_text(size=12),
          plot.title = element_text(size=8),
          axis.text=element_text(size=12),
          legend.position="top") +
    scale_color_manual(name=element_blank(),
                       breaks=c("TRUE", "FALSE"),
                       labels=c(expression(Above~f[age]), expression(Below~f[age])),
                       values=c("TRUE"="#619CFF","FALSE"="#00BA38")) +
    ggtitle(title)
  ggsave(paste0("Figures/crim_inv_weightings_general/",title,".png"),plot=p,width = 3.5, height = 2.5, units = "in")
}
# filter out datapoints low quality
# filt 8 will be applied later
# Keep valid, on-or-above-age-spline candidates with crim_inv <= 40.
features_crim_inv_poly = features %>%
filter(filt1, filt3, filt5, crim_inv <= 40) # convert to actual filter later
# Lower-bound points: the 4 smallest remainders at each crim_inv value,
# among points above the age spline (filt8).
lb_crim_inv = features_crim_inv_poly %>%
filter(filt8) %>%
select(-filt8) %>%
# select(crim_inv, raw_score__age_spline) %>%
group_by(crim_inv) %>%
arrange(raw_score__age_spline) %>%
top_n(n=-4, wt=raw_score__age_spline) # Fit lower bound on smallest value for each crim inv component
# write.csv(lb_crim_inv, "../lb_crim_inv.csv")
# predict the compas age remainder
# Cubic fit of the remainder's lower bound in crim_inv.
mdl_crim_inv = lm(raw_score__age_spline ~
I(crim_inv^3) +
I(crim_inv^2) +
crim_inv,
data=lb_crim_inv)
# More precision for paper
summary(mdl_crim_inv)
##
## Call:
## lm(formula = raw_score__age_spline ~ I(crim_inv^3) + I(crim_inv^2) +
## crim_inv, data = lb_crim_inv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.06727 -0.10168 0.03038 0.15386 0.57592
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.816e-01 7.165e-02 -2.534 0.0122 *
## I(crim_inv^3) 2.570e-05 1.532e-05 1.678 0.0954 .
## I(crim_inv^2) -2.222e-03 9.280e-04 -2.394 0.0178 *
## crim_inv 1.193e-01 1.578e-02 7.559 3.03e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2502 on 159 degrees of freedom
## Multiple R-squared: 0.9136, Adjusted R-squared: 0.9119
## F-statistic: 560.1 on 3 and 159 DF, p-value: < 2.2e-16
# Report cubic coefficients at full precision for the paper.
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_crim_inv$coefficients) # More precision for paper
## [1] "-1.81561193977485685336e-01" "2.57044321819749947103e-05"
## [3] "-2.22161522877928341302e-03" "1.19300684990506777883e-01"
## Add g(crim_inv) to features
# note that g(crim_inv) fits the lower bound of the COMPAS GENERAL AGE REMAINDER
# filt9 keeps points on/above the cubic (0.05 tolerance).
# NOTE(review): the "__g_vio_hist" suffix looks copy-pasted from the
# violence-score version of this script — the value is the general-score
# remainder after g(crim_inv); confirm the intended name.
features = features %>%
mutate(
g_crim_inv = predict(mdl_crim_inv, newdata=data.frame(crim_inv=crim_inv)),
raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_crim_inv ,
filt9 = raw_score__age_spline >= g_crim_inv - 0.05
)
## Add same filters to lb_crim_inv
lb_crim_inv = lb_crim_inv %>%
mutate(
g_crim_inv = predict(mdl_crim_inv, newdata=data.frame(crim_inv=crim_inv)),
raw_score__age_spline__g_vio_hist = raw_score__age_spline - g_crim_inv ,
filt9 = raw_score__age_spline >= g_crim_inv - 0.05
)
Plotting settings
# Plotting range for the criminal-involvement sum plus a dense grid of
# 1000 evaluation points for drawing fitted curves.
xmin_crim_inv <- 0
xmax_crim_inv <- 40
xx_crim_inv <- seq(xmin_crim_inv, xmax_crim_inv, length.out = 1000)
Generate a preliminary plot of crim inv vs COMPAS general score remainder
# Lower-bound points colored by filt9 (above/below the cubic), with the
# fitted cubic overlaid.
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline, color = factor(filt9)),alpha=.3, data=lb_crim_inv) +
geom_line(aes(x=xx_crim_inv, predict(mdl_crim_inv, newdata=data.frame(crim_inv=xx_crim_inv))),color="#F8766D") +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv)+
xlab("Criminal Involvement Subscale Sum") +
ylab("General score - f_age") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# Cubic over all candidate rows (not just the per-value minima).
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline), color="#619CFF",alpha=.3, data=features_crim_inv_poly) +
geom_line(aes(x=xx_crim_inv, predict(mdl_crim_inv, newdata=data.frame(crim_inv=xx_crim_inv))),color="#F8766D") +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv)+
xlab("Criminal Involvement Subscale Sum") +
ylab("General score - f_age") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# unfiltered
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline), color="#619CFF",alpha=.3, data=features) +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv)+
ylim(-0.5, 4) +
xlab("Criminal Involvement Subscale Sum -- Filtered ") +
ylab("General score - f_age") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
## Warning: Removed 38 rows containing missing values (geom_point).
# filtered
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline, color=factor(filt9)),alpha=.3, data=features) +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv) +
ylim(-0.5, 4) +
xlab("Criminal Involvement Subscale Sum -- Filtered ") +
ylab("General score - f_age") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
## Warning: Removed 38 rows containing missing values (geom_point).
We want to keep track of the points that are below the age spline and the points that are below the criminal-involvement polynomial.
## Data for the crim_inv spline: valid rows above both the age spline (filt8)
## and the crim_inv cubic (filt9), with crim_inv <= 40.
features_crim_inv_spline = features %>%
  filter(filt1, filt3, filt5, filt8, filt9, crim_inv <=40)
# points we will use to construct linear crim inv spline
# (the smallest remainder at each crim_inv value)
lb_filt_crim_inv = features_crim_inv_spline %>%
  group_by(crim_inv) %>%
  arrange(raw_score__age_spline)%>%
  top_n(n=-1, wt = raw_score__age_spline)
# points filtered out because they are below the age spline
lb_age_spline_outliers = features %>%
  filter(filt1, filt3, filt5, !filt8, crim_inv <= 40) %>%
  mutate(reason = "Below age spline")
# points filtered out because they are below the crim. inv. poly.
lb_crim_inv_poly_outliers = features %>%
  filter(filt1, filt3, filt5, filt8, !filt9, crim_inv <= 40) %>%
  mutate(reason = "Below crim. inv. polynomial")
lb_crim_inv_all = lb_filt_crim_inv %>%
  mutate(reason = "Used to construct linear splines") %>%
  # BUG FIX: the two outlier frames were wrapped in list(), handing rbind()
  # a single 2-element list instead of two data frames; pass them as
  # separate arguments so their rows are appended.
  rbind(lb_age_spline_outliers, lb_crim_inv_poly_outliers) %>%
  select(person_id, crim_inv, raw_score__age_spline, reason)
rm(lb_age_spline_outliers, lb_crim_inv_poly_outliers)
Generating linear splines to fit the lower bound of the COMPAS age remainder. Plotting individuals on the new bottom edge produced by fitting to the lb_filt individuals.
# Piecewise-linear (segmented) fit of the remainder's lower bound vs
# crim_inv, one breakpoint initialized at 9 (segmented re-estimates it).
mdl_crim_inv_spline <- segmented(lm(raw_score__age_spline ~ crim_inv, data = lb_filt_crim_inv),
seg.Z = ~crim_inv, psi = list(crim_inv = c(9)),
control = seg.control(display = FALSE)
)
#Add Filter 10
# filt10 keeps points on/above the crim_inv spline (0.05 tolerance).
features = features %>%
mutate(
crim_inv_spline = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=crim_inv)),
raw_score__age_spline__crim_inv_spline = raw_score__age_spline - crim_inv_spline,
filt10 = raw_score__age_spline >= crim_inv_spline - 0.05
)
# Report per-segment intercepts, slopes, and the estimated breakpoint.
intercept(mdl_crim_inv_spline)
## $crim_inv
## Est.
## intercept1 -0.12241
## intercept2 0.50939
slope(mdl_crim_inv_spline)
## $crim_inv
## Est. St.Err. t value CI(95%).l CI(95%).u
## slope1 0.097854 0.0051412 19.033 0.087437 0.108270
## slope2 0.055734 0.0022495 24.776 0.051176 0.060292
summary.segmented(mdl_crim_inv_spline)$psi
## Initial Est. St.Err
## psi1.crim_inv 9 14.99998 1.355633
Plot crim inv spline
# Estimated breakpoint; split the x grid into one range per linear segment.
# (Note: break1/xx1/xx2 are reused from the age-spline section above.)
break1 = summary.segmented(mdl_crim_inv_spline)$psi[1,2]
xx1 = seq(xmin_crim_inv,break1, length.out=1000)
xx2 = seq(break1,xmax_crim_inv, length.out=1000)
# Paper figure: lower-bound/outlier points (colored by reason) with the
# two spline segments overlaid.
crim_inv_spline_recid =
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline, color= factor(reason)),alpha=.5, data=lb_crim_inv_all ) +
scale_color_manual(values=c("green", "red", "grey25")) +
geom_line(aes(x=xx1, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx1))),
color="darkorange1", size = 1) +
geom_line(aes(x=xx2, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx2))),
color="cyan3", size = 1) +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv)+
xlab("Sum of Criminal Involvement \n Subscale Components") +
ylab("General Score - f_age \n") +
theme(text = element_text(size=9),
axis.text=element_text(size=7),
legend.position = c(0,1),
legend.title = element_blank(),
legend.justification = c("left", "top"),
legend.key = element_rect(fill = NA),
legend.background = element_rect(fill=NA,
size=0.2,
linetype="solid")
)
crim_inv_spline_recid
# Crim inv spline with all data (filters 1,5 still applied for data quality)
crim_inv_spline_recid_all =
ggplot()+
geom_point(aes(x=crim_inv, raw_score__age_spline), # age spline outliers
color="green",
data=features %>% filter(filt1, filt3, filt5,!filt8)) +
geom_point(aes(x=crim_inv, raw_score__age_spline), # crim inv poly outliers
color="red", alpha=.3,
data=features %>% filter(filt1, filt3, filt5,filt8, !filt9)) +
geom_point(aes(x=crim_inv, raw_score__age_spline), # not outliers
color="#619CFF", alpha=.3,
data=features %>% filter(filt1, filt3, filt5,filt8, filt9)) +
geom_line(aes(x=xx_crim_inv, y = predict(mdl_crim_inv_spline, newdata=data.frame(crim_inv=xx_crim_inv))),
color="#F8766D", size = 1) +
theme_bw()+
xlim(xmin_crim_inv,xmax_crim_inv)+
xlab("Sum of Criminal Involvement\n Subscale Components") +
ylab("General score - f_age \n") +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top")
crim_inv_spline_recid_all
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
# Save both crim_inv-spline figures at paper size.
ggsave("Figures/crim_inv_spline_general.pdf", plot = crim_inv_spline_recid, width = 3.5, height = 2.5, units = "in")
ggsave("Figures/crim_inv_spline_alldata_general.pdf", plot = crim_inv_spline_recid_all, width = 3.5, height = 2.5, units = "in")
## Warning: Removed 18 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_point).
There are a few groups of predictors we will use: * Group 0: without using age variables or race * Group 1: without using current age or race but with age at first arrest * Group 2: without using current age but with race and age at first arrest * Group 3: without using race but with current age and age at first arrest * Group 4: using age variables and race * Group 5: test
#### Generic stuff (applies to all models)
## Filter rows
features_filt = features %>%
filter(filt1, filt3)
## Set parameters (each combination will be run)
# xgboost
# NOTE(review): "reg:linear" is deprecated in newer xgboost releases in
# favor of "reg:squarederror" — confirm the installed xgboost version.
param <- list(objective = "reg:linear",
eval_metric = "rmse",
eta = c(.05,.1),
gamma = c(.5, 1),
max_depth = c(2,5),
min_child_weight = c(5,10),
subsample = c(1),
colsample_bytree = c(1)
)
# svm
param_svm = list(
type = 'eps-regression',
cost = c(0.5,1,2),
epsilon = c(0.5,1,1.5),
gamma_scale = c(0.5,1,2)
)
# One row of training RMSEs per predictor group (0-8), one column per model.
res_rmse = data.frame(Group = 0:8, lm = NA, xgb = NA, rf = NA, svm = NA)
### Create group 0 training data
## Select features and round count features
# Group 0: no age variables and no race — counts only.
train = features_filt %>%
select(
#p_current_age,
# p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Baseline: linear model of the age-spline remainder on the group features.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.42430 -0.45339 -0.05984 0.38049 2.37584
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.875979 0.006968 125.717 <2e-16 ***
## p_arrest 0.059410 0.001993 29.809 <2e-16 ***
## p_jail30 -0.084580 0.040735 -2.076 0.0379 *
## p_prison 0.171827 0.008814 19.494 <2e-16 ***
## p_probation 0.072067 0.008348 8.633 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5765 on 9037 degrees of freedom
## Multiple R-squared: 0.4038, Adjusted R-squared: 0.4036
## F-statistic: 1530 on 4 and 9037 DF, p-value: < 2.2e-16
# Training-set RMSE for the linear model (rmse comes from functions.r).
res_rmse[res_rmse$Group==0,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(923)
# fit_xgboost (from functions.r) appears to grid-search over param — confirm.
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==0,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Square plot of prediction vs COMPAS remainder, with the y=x reference.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
# Random forest with default settings; out-of-bag predictions used for RMSE.
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
res_rmse[res_rmse$Group==0,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
# fit_svm (from functions.r) appears to grid-search over param_svm — confirm.
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 20
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.4"
res_rmse[res_rmse$Group==0,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Drop group-0 objects before building the next group.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 1 training data
## Select features and round count features
# Group 1: no current age and no race, but WITH age at first offense.
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Baseline: linear model of the age-spline remainder on the group features.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.09844 -0.44649 -0.06349 0.36243 2.54534
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0933977 0.0173633 62.972 < 2e-16 ***
## p_age_first_offense -0.0075173 0.0005509 -13.645 < 2e-16 ***
## p_arrest 0.0556322 0.0019923 27.924 < 2e-16 ***
## p_jail30 -0.0663080 0.0403463 -1.643 0.1
## p_prison 0.1767267 0.0087328 20.237 < 2e-16 ***
## p_probation 0.0661198 0.0082754 7.990 1.52e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5707 on 9036 degrees of freedom
## Multiple R-squared: 0.4159, Adjusted R-squared: 0.4156
## F-statistic: 1287 on 5 and 9036 DF, p-value: < 2.2e-16
# Training-set RMSE for the linear model.
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 16
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "1"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP
# Square plot of prediction vs COMPAS remainder, with the y=x reference.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
# Random forest with default settings; out-of-bag predictions used for RMSE.
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 19
## type "eps-regression"
## cost "0.5"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.3333333"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Drop group-1 objects before building the next group.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 2 training data
## Select features and round count features
# Group 2: no current age, but WITH race indicators and age at first offense.
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Baseline: linear model of the age-spline remainder on the group features.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.93704 -0.43505 -0.06018 0.35537 2.40025
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7715142 0.0294567 26.192 < 2e-16 ***
## p_age_first_offense -0.0056361 0.0005609 -10.048 < 2e-16 ***
## p_arrest 0.0535447 0.0019642 27.260 < 2e-16 ***
## p_jail30 -0.0502479 0.0396846 -1.266 0.20548
## p_prison 0.1687020 0.0086064 19.602 < 2e-16 ***
## p_probation 0.0653331 0.0081413 8.025 1.14e-15 ***
## race_black 0.3672723 0.0255076 14.399 < 2e-16 ***
## race_white 0.2504831 0.0257782 9.717 < 2e-16 ***
## race_hispanic 0.1000628 0.0309000 3.238 0.00121 **
## race_asian 0.0958429 0.0852294 1.125 0.26082
## race_native 0.2440703 0.1087937 2.243 0.02489 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.561 on 9031 degrees of freedom
## Multiple R-squared: 0.4358, Adjusted R-squared: 0.4352
## F-statistic: 697.7 on 10 and 9031 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 2 linear model.
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
## Tune/fit xgboost for group 2 (seed fixed for reproducibility).
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
## Predicted vs actual on the training data, as for group 1.
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
## Second scatter: xgboost prediction against the ORIGINAL COMPAS raw score
## (not the age-adjusted target).
data.frame(xgboost = pred, compas=features_filt$raw_score) %>%
ggplot() +
geom_point(aes(x=xgboost,y=compas), alpha=.3) +
theme_bw()+
xlab("XGBoost prediction") +
ylab("COMPAS raw score")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(6778)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 21
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1818182"
## In-sample SVM RMSE for group 2.
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
## Drop the group 2 data and ALL four models.
## Fix: mdl_svm was missing from this rm() call (unlike the group 1 cleanup),
## leaving a stale SVM model in the workspace between groups.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 3 training data
## Select features and round count features
## Group 3: both age variables + criminal-history counts, no race dummies.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
## Baseline linear model.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.83634 -0.44132 -0.06739 0.35870 2.55283
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.059171 0.017881 59.235 < 2e-16 ***
## p_current_age 0.009175 0.001203 7.627 2.64e-14 ***
## p_age_first_offense -0.016188 0.001263 -12.822 < 2e-16 ***
## p_arrest 0.051133 0.002072 24.681 < 2e-16 ***
## p_jail30 -0.039529 0.040372 -0.979 0.328
## p_prison 0.159596 0.008990 17.752 < 2e-16 ***
## p_probation 0.052787 0.008432 6.260 4.03e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5689 on 9035 degrees of freedom
## Multiple R-squared: 0.4196, Adjusted R-squared: 0.4192
## F-statistic: 1089 on 6 and 9035 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 3 linear model.
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(999)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
## Save this group's scatter for the paper (ggsave writes the last plot).
ggsave("Figures/raw-fage_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(5)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 20
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.2857143"
## In-sample SVM RMSE for group 3.
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
## Fix: include mdl_svm in the cleanup (it was omitted here, unlike the
## group 1 / 6 / 7 / 8 cleanups), so the stale SVM is not left in the workspace.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 4 training data (header previously mislabeled "group 2";
### this is the group 2 predictor set PLUS p_current_age — RMSEs below are
### stored under Group==4)
## Select features and round count features
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
## Baseline linear model.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76345 -0.43064 -0.06504 0.35500 2.40477
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.737217 0.029694 24.827 < 2e-16 ***
## p_current_age 0.009171 0.001185 7.736 1.13e-14 ***
## p_age_first_offense -0.014257 0.001247 -11.435 < 2e-16 ***
## p_arrest 0.049026 0.002043 23.995 < 2e-16 ***
## p_jail30 -0.023093 0.039711 -0.582 0.560901
## p_prison 0.151392 0.008865 17.077 < 2e-16 ***
## p_probation 0.052113 0.008293 6.284 3.45e-10 ***
## race_black 0.367906 0.025425 14.470 < 2e-16 ***
## race_white 0.245145 0.025704 9.537 < 2e-16 ***
## race_hispanic 0.104469 0.030805 3.391 0.000699 ***
## race_asian 0.103146 0.084958 1.214 0.224749
## race_native 0.235458 0.108447 2.171 0.029943 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5592 on 9030 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4389
## F-statistic: 643.8 on 11 and 9030 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 4 linear model.
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(23)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 16
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "1"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12))
ggsave("Figures/raw-fage_xgboost_gp4_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
## 20 hand-picked individuals (keyed by person_id + screening_date) to highlight
## in the annotated scatter; labelled "In Table 5" below — presumably Table 5 of
## the accompanying paper.
highlight = data.frame(
person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
highlight = TRUE
)
## Attach xgboost predictions and the highlight flag; non-matches become FALSE.
df_plot = features_filt %>%
bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
left_join(highlight, by = c("person_id","screening_date")) %>%
mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
## Which quadrant each name label is placed in (controls hjust/vjust below).
person_id_text_topright = c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(1394, 1497)
person_id_text_botright = c(11312, 6886, 8491, 10774)
person_id_text_botleft = c(799)
## Annotated scatter: prediction vs raw score, highlighted points drawn on top.
ggplot() +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), data = filter(df_plot, highlight=="In Table 5")) +
theme_bw()+
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) +
xlab(expression(XGBoost~pred.~of~general~score~-~f[age])) +
ylab("General score")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12),
#legend.position = "top",
legend.position="none") +
scale_color_discrete(name = element_blank()) +
xlim(0.2,3.5)
ggsave("Figures/xgboost_rawScore_annotate_general.pdf",width = 4, height = 4, units = "in")
set.seed(3720)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 21
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1666667"
## In-sample SVM RMSE for group 4.
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
## Fix: include mdl_svm in the cleanup (it was omitted here, unlike the
## group 1 / 6 / 7 / 8 cleanups), so the stale SVM is not left in the workspace.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 5 training data
## Select features and round count features
## Group 5 = group 4 plus p_prison30.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison30, # NOTE(review): lm below returns NA for this coefficient (rank deficient), so it appears linearly dependent on the other predictors in this data
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
## Baseline linear model (rank-deficient: 1 coefficient not defined, see output).
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.76345 -0.43064 -0.06504 0.35500 2.40477
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.737217 0.029694 24.827 < 2e-16 ***
## p_current_age 0.009171 0.001185 7.736 1.13e-14 ***
## p_age_first_offense -0.014257 0.001247 -11.435 < 2e-16 ***
## p_arrest 0.049026 0.002043 23.995 < 2e-16 ***
## p_jail30 -0.023093 0.039711 -0.582 0.560901
## p_prison30 NA NA NA NA
## p_prison 0.151392 0.008865 17.077 < 2e-16 ***
## p_probation 0.052113 0.008293 6.284 3.45e-10 ***
## race_black 0.367906 0.025425 14.470 < 2e-16 ***
## race_white 0.245145 0.025704 9.537 < 2e-16 ***
## race_hispanic 0.104469 0.030805 3.391 0.000699 ***
## race_asian 0.103146 0.084958 1.214 0.224749
## race_native 0.235458 0.108447 2.171 0.029943 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5592 on 9030 degrees of freedom
## Multiple R-squared: 0.4396, Adjusted R-squared: 0.4389
## F-statistic: 643.8 on 11 and 9030 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 5 linear model. The warning below is expected:
## the fit is rank deficient because of p_prison30 (NA coefficient above).
res_rmse[res_rmse$Group==5,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==5,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(1123)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==5,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 21
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1538462"
## In-sample SVM RMSE for group 5.
res_rmse[res_rmse$Group==5,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
## Fix: include mdl_svm in the cleanup (it was omitted here, unlike the
## group 1 / 6 / 7 / 8 cleanups), so the stale SVM is not left in the workspace.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
## Interim summary table of training RMSEs per group and model.
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 0 | 0.5763262 | 0.5197590 | 0.5520113 | 0.5204621 |
| 1 | 0.5704792 | 0.5089603 | 0.5430797 | 0.5125682 |
| 2 | 0.5606448 | 0.4970774 | 0.5118637 | 0.5024905 |
| 3 | 0.5686515 | 0.4987465 | 0.5141315 | 0.5042145 |
| 4 | 0.5587960 | 0.4940864 | 0.5071319 | 0.4961323 |
| 5 | 0.5587960 | 0.4895826 | 0.5061418 | 0.4967642 |
| 6 | NA | NA | NA | NA |
| 7 | NA | NA | NA | NA |
| 8 | NA | NA | NA | NA |
There are a few groups of predictors we will use:

* Group 6: without criminal history (only age variables)
* Group 7: without age variables (only criminal history)
* Group 8: with everything (age and criminal history)
Race not included in any of groups 6-8.
### Create group 6 training data
## Select features and round count features
## Group 6: age variables ONLY. Target now subtracts BOTH lower bounds
## (age spline and criminal-involvement spline).
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
#p_arrest,
#p_jail30,
#p_prison,
#p_probation,
raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)
## Baseline linear model.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5944 -0.4298 -0.0861 0.3402 2.5345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0696061 0.0175379 60.988 < 2e-16 ***
## p_current_age -0.0064197 0.0008706 -7.374 1.81e-13 ***
## p_age_first_offense 0.0018659 0.0009013 2.070 0.0385 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.568 on 9039 degrees of freedom
## Multiple R-squared: 0.01081, Adjusted R-squared: 0.01059
## F-statistic: 49.4 on 2 and 9039 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 6 linear model.
res_rmse[res_rmse$Group==6,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 13
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==6,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
## NOTE(review): axis label still reads "score - f[age]" although the target
## here also subtracts g[crim inv] (cf. the fuller label used near the end of
## this file) — confirm intended.
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
mdl_rf = randomForest(
formula = raw_score__age_spline__crim_inv_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==6,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 24
## type "eps-regression"
## cost "2"
## epsilon "1"
## gamma_scale "2"
## gamma "0.6666667"
## In-sample SVM RMSE for group 6.
res_rmse[res_rmse$Group==6,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
## Drop the group 6 data and all four models before building group 7.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 7 training data
## Select features and round count features
## Group 7: criminal-history counts ONLY (age variables commented out).
train = features_filt %>%
select(
#p_current_age,
# p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)
## Baseline linear model.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80776 -0.43212 -0.07162 0.35501 2.40310
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.971128 0.006747 143.926 < 2e-16 ***
## p_arrest -0.018578 0.001930 -9.626 < 2e-16 ***
## p_jail30 -0.097527 0.039446 -2.472 0.0134 *
## p_prison 0.092803 0.008535 10.873 < 2e-16 ***
## p_probation -0.043673 0.008084 -5.402 6.74e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5582 on 9037 degrees of freedom
## Multiple R-squared: 0.04486, Adjusted R-squared: 0.04443
## F-statistic: 106.1 on 4 and 9037 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 7 linear model.
res_rmse[res_rmse$Group==7,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 6
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==7,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
## NOTE(review): axis label omits the g[crim inv] term that this target also
## subtracts — confirm intended.
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
mdl_rf = randomForest(
formula = raw_score__age_spline__crim_inv_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==7,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 23
## type "eps-regression"
## cost "1"
## epsilon "1"
## gamma_scale "2"
## gamma "0.4"
## In-sample SVM RMSE for group 7.
res_rmse[res_rmse$Group==7,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
## Drop the group 7 data and all four models before building group 8.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 8 training data
## Select features and round count features
## Group 8: both age variables AND criminal-history counts (no race dummies).
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)
## Baseline linear model.
mdl_lm = lm(raw_score__age_spline__crim_inv_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline__crim_inv_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.86075 -0.42502 -0.07299 0.34222 2.55373
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.140689 0.017385 65.615 < 2e-16 ***
## p_current_age 0.005347 0.001170 4.572 4.90e-06 ***
## p_age_first_offense -0.011605 0.001227 -9.454 < 2e-16 ***
## p_arrest -0.024493 0.002014 -12.160 < 2e-16 ***
## p_jail30 -0.065995 0.039252 -1.681 0.0927 .
## p_prison 0.087091 0.008741 9.964 < 2e-16 ***
## p_probation -0.056627 0.008198 -6.907 5.28e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5531 on 9035 degrees of freedom
## Multiple R-squared: 0.06265, Adjusted R-squared: 0.06202
## F-statistic: 100.6 on 6 and 9035 DF, p-value: < 2.2e-16
## In-sample RMSE of the group 8 linear model.
res_rmse[res_rmse$Group==8,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
set.seed(923)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
res_rmse[res_rmse$Group==8,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
## NOTE(review): axis label omits the g[crim inv] term that this target also
## subtracts — confirm intended.
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
mdl_rf = randomForest(
formula = raw_score__age_spline__crim_inv_spline ~ .,
data = train
)
## mdl_rf$predicted = out-of-bag predictions (per randomForest docs).
res_rmse[res_rmse$Group==8,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline__crim_inv_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 24
## type "eps-regression"
## cost "2"
## epsilon "1"
## gamma_scale "2"
## gamma "0.2857143"
## In-sample SVM RMSE for group 8.
res_rmse[res_rmse$Group==8,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline__crim_inv_spline) # ADJUST GROUP
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
## Final summary table of training RMSEs for all groups 0-8.
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 0 | 0.5763262 | 0.5197590 | 0.5520113 | 0.5204621 |
| 1 | 0.5704792 | 0.5089603 | 0.5430797 | 0.5125682 |
| 2 | 0.5606448 | 0.4970774 | 0.5118637 | 0.5024905 |
| 3 | 0.5686515 | 0.4987465 | 0.5141315 | 0.5042145 |
| 4 | 0.5587960 | 0.4940864 | 0.5071319 | 0.4961323 |
| 5 | 0.5587960 | 0.4895826 | 0.5061418 | 0.4967642 |
| 6 | 0.5679499 | 0.5520712 | 0.5619801 | 0.5551400 |
| 7 | 0.5580910 | 0.5192577 | 0.5320781 | 0.5221024 |
| 8 | 0.5528691 | 0.4978607 | 0.5112501 | 0.5063758 |
We use the “Group 3” models but predict the raw score and the raw score minus all lower bounds we fitted. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound.
### Create group 3 training data
## Select features and round count features
## Same predictors as group 3, but the target is the UNADJUSTED raw score.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score) %>% as.matrix(),
"label" = train %>% select(raw_score) %>% as.matrix()
)
set.seed(540)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 13
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score
## In-sample RMSE when predicting the raw score directly.
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4953"
## Predicted vs actual raw score; common axis limits for a 45-degree identity line.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("General score") +
ylab("XGBoost prediction")+
#annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Create group 3 training data
## Select features and round count features
## Group 3 predictors; target = raw score minus BOTH fitted lower bounds
## (age spline and criminal-involvement spline).
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_arrest,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline__crim_inv_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline__crim_inv_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline__crim_inv_spline) %>% as.matrix()
)
set.seed(7301)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline__crim_inv_spline
## In-sample RMSE for the both-splines-removed target.
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.4987"
## Predicted vs actual (score minus both splines); identity line in red.
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age]~-~g[crim~inv])) +
ylab("XGBoost prediction")+
#annotate("text", x = 0.5, y = 4, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw-fage-gcriminv_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
## Replicate the ProPublica-style logistic regression: predict a high COMPAS
## score (decile >= 5) from demographics and criminal history.
propub = features_filt %>%
filter(filt4) %>% # Only people with valid recidivism values
mutate(age_low = if_else(p_current_age < 25,1,0),
age_high = if_else(p_current_age > 45,1,0),
female = if_else(sex=="Female",1,0),
n_priors = p_felony_count_person + p_misdem_count_person,
compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis
print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
## NOTE(review): race is already a releveled factor from the mutate above, so
## the as.factor(race) wrapper is redundant (but harmless — the reference
## level is preserved, as the coefficient names in the output show).
mdl_glm = glm(compas_high ~
female +
age_high +
age_low +
as.factor(race) +
p_arrest +
is_misdem +
recid,
family=binomial(link='logit'), data=propub)
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_arrest + is_misdem + recid, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.3690 -0.7420 -0.2762 0.8256 2.7371
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.63647 0.08358 -19.579 < 2e-16 ***
## female 0.18584 0.08647 2.149 0.0316 *
## age_high -1.61708 0.13790 -11.727 < 2e-16 ***
## age_low 1.55708 0.07244 21.495 < 2e-16 ***
## as.factor(race)African-American 0.47448 0.07431 6.385 1.71e-10 ***
## as.factor(race)Asian -0.27638 0.51050 -0.541 0.5882
## as.factor(race)Hispanic -0.26119 0.13121 -1.991 0.0465 *
## as.factor(race)Native American 0.45819 0.66743 0.687 0.4924
## as.factor(race)Other -0.73909 0.16196 -4.563 5.03e-06 ***
## p_arrest 0.30132 0.01224 24.616 < 2e-16 ***
## is_misdem -0.46844 0.07071 -6.625 3.47e-11 ***
## recid 0.50614 0.06974 7.257 3.95e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7907.2 on 5758 degrees of freedom
## Residual deviance: 5492.4 on 5747 degrees of freedom
## AIC: 5516.4
##
## Number of Fisher Scoring iterations: 5